# keep columns 1,2,5,6,7,9,10,16,17,18,19
p_e <- prices_earnings[, c(1,2,5,6,7,9,10,16,17,18,19)]
# set column 1 to row names.
rownames(p_e) <- p_e[[1]]
Without doing any reordering We cannot identify any clusters or outliers. The heatmap looks randomized.
#p_e_sc %>% it was not possible to pipe the data since it is a matrix
plot_ly(x =~colnames(p_e_sc), y =~rownames(p_e_sc),
z = ~p_e_sc, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Without ordering)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
The ordering by euclidean distance produces a heat map that is easier to analyze. At first glance we can perceive four general regions of two groups. The first group heat map color tends towards higher price values (a brighter shade of red) while the second group tend towards lower price values (a darker shade of red/black). Although these groups can be seen in the correlation distance heat map, it is not as clear as the one produced by euclidean distance.
Based on the euclidean distance heat map, net wage tends to higher values from Dubai up towards Tokyo while the number of hours worked decrease. This is the opposite of cities like Delhi,Bankok and Seoul. Interestingly food costs are generally low in the cities with higher working hours. Caracas is an outlier because food costs are high while net wage and the number of hours worked remains low.
We used method “OLO” as the Hierarchical Clustering algorithm instead of “HC” method, because according to the documentation in seriation package and (Hahsler and Buchta 2008) the former does not optimize the Hamiltonian Path Length.
# seriation needs to permute rows and columns, thus distance by row and column
p_e_rdist <- dist(p_e_sc, method = "euclidean")
# by column
p_e_cdist <- dist(t(p_e_sc), method = "euclidean")
# make sure that results are reproducible
set.seed(1011)
# get orders of the row and col distances; Hamilton path length
ser_row_HC <- seriate(p_e_rdist, method = "OLO")
ser_col_HC <- seriate(p_e_cdist, method = "OLO")
order1 <- get_order(ser_row_HC)
order2 <-get_order(ser_col_HC )
# reorder
p_reord <- p_e_sc[rev(order1), order2]
# plot
plot_ly(x =~colnames(p_reord), y =~rownames(p_reord),
z = ~p_reord, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Euclid dist)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
From visual comparison of the heatmap produced by HC solver and TSP, we prefer the latter because there appears to be a separation along the anti diagonal such that cities that have similary high prices are placed along this diagonal. In the heatmap by the TSP solver this distinction is not quite clear.
# use p_e_rdist and p_e_cdist (euclidean distance)
# set seed
set.seed(11)
ser_row_TSP <- seriate(p_e_rdist, method = "TSP")
set.seed(111)
ser_col_TSP <- seriate(p_e_cdist, method = "TSP")
# orders
ord_q4_1 <- get_order(ser_row_TSP)
order_q4_2 <-get_order(ser_col_TSP)
# reorder
p_reord_q4 <- p_e_sc[rev(ord_q4_1), order_q4_2]
# plot
plot_ly(x =~colnames(p_reord_q4), y =~rownames(p_reord_q4),
z = ~p_reord_q4, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Euclid dist- TSP)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
TSP solver has shorter path length (120.9403) compared to HC solver (121.967). Thus it does a better job of optimizing the Hamiltonian Path Length. For measure of effectiveness (ME) the TSP solver (ME = 650.1384) is at a disadvantage compared to the HC (ME = 652.4429). A higher ME implies a better arrangement of the dissimilarity matrix. For gradient measures, since objective is to increase the distance from main diagonal, we infer that HC (Gradient_raw = 65528) does a better job of achieving this compared to TSP solver(Gradient_raw = 38166).
# function creterion to compare unordered distance and ordered
# distance = p_e_rdist (row distance)
# use ser_row_HC because its a simulation, we need to compare the exact same results as used in heatmaps
# use ser_row_TSP, same reason as above
result <- rbind(HC = criterion(p_e_rdist,ser_row_HC ), TSP = criterion(p_e_rdist,ser_row_TSP))
result
2SUM AR_deviations AR_events BAR Cor_R Gradient_raw Gradient_weighted Inertia Lazy_path_length
HC 756120.2 20296.15 26876 19055.56 0.2071334 65528 156151.06 24666913 3713.804
TSP 863212.6 50027.67 40557 19664.06 0.1146549 38166 94761.38 21388093 3898.098
Least_squares LS ME Moore_stress Neumann_stress Path_length RGAR
HC 3352459 894638.7 652.4429 411.5392 239.0233 121.9671 0.2253186
TSP 3434312 935565.1 650.1384 426.7640 242.8195 120.9403 0.3400151
p_e_sc2 %>% plot_ly(type ="parcoords",
dimensions = list(
list(label = "Food.Costs...", values = ~Food.Costs...),
list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
list(label = "Clothing.Index", values = ~Clothing.Index),
list(label = "Hours.Worked", values = ~Hours.Worked),
list(label = "Wage.Net", values = ~Wage.Net),
list(label = "Vacation.Days", values = ~Vacation.Days),
list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)
)
p_e_sc2 %>% plot_ly(type ="parcoords",
line = list(color = ~clust, colorscale = list(c(0, "red"), c(1, "blue"))),
dimensions = list(
list(label = "Food.Costs...", values = ~Food.Costs...),
list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
list(label = "Clothing.Index", values = ~Clothing.Index),
list(label = "Hours.Worked", values = ~Hours.Worked),
list(label = "Wage.Net", values = ~Wage.Net),
list(label = "Vacation.Days", values = ~Vacation.Days),
list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)
)
We can identify two clusters defined by Wage net (blue) and iphone 4s (red). Wage net has values greater than 0 in the red cluster (defined by iphone 4) while iphone has values has values greater than -0.5 in the blue cluster. These clusters are difficult to interpret because there is no clear pattern of association between the variables. The most prominent outlier in the blue cluster is the price of Rice per kg, The lines at this variable are furthest apart . It does not seem to fit into any of the two clusters.
Cairo is the most distinct outlier with respect to the price of bread per kg and net wage being very low while average number of work hours are very high.. Prague, Johanessburg and Panama form a cluster the prices are comparatively small and similar across the products.
Among Heatmaps, paralled coordinates and radar charts, heatmaps are relatively easier to interprate in terms of time and accuracy. Radar charts are hard to interprate when the number of variables to compare are high. We found parallel coordiantes to be generally messy to work with.
The points in the scatter plot are very close to each other and so, it is difficult to interpret the results for both <=50K and >50K income levels. The trellis plot shows both income levels in separate panels and so, it is quite easy to analyse the datas. The people with age range of 0-25 who earns <=50K are working for long hours than people earning >50K.
plot1 <- ggplot(df, aes(age,hours_per_week,color=income_level))+labs(title="Scatter plot",x="Age", y = "Hours per week")+
geom_point(size=1)
plot1
plot2<-ggplot(df, aes(age,hours_per_week,color=income_level))+geom_point(size=1)+labs(title="Trellis plot",x="Age", y = "Hours per week")+
facet_wrap(~income_level)
plot2
The density curve is skewed to the right (right skewed distribution) for income level <=50K and thus not normally distributed.The median income level for <=50K is less than the mean. Whereas the the density curve for income of >50K is bimodal. The trellis plot exhibits both income level for people with various marital status. The density curve for widower is normally distributed for both income levels. The people who were never married earnig <=50K lies within range of 17-40 age group.
plot3 <- ggplot(df, aes(x=age, group=income_level, fill=income_level))+
geom_density(alpha=0.5) + labs(title="Density plot of age")
plot3
plot4<-ggplot(df, aes(x=age,group=income_level,fill=income_level))+geom_density(alpha=0.5)+labs(title="Trellis plot of age")+facet_wrap(~marital_status)
plot4
In 3D-scatter plot, the overlapping of data points around the same values makes it difficult to analyse the relationship between given three varaibles. In raster type 2D-density plot, every panels of age group 29-90 exhibits almost same outputs for capital loss ranging 1000-2000 except age group of 17-29.
df1 <- filter(df, cap_loss!=0)
df1 %>% plot_ly(x = ~edu_num, y = ~age, z = ~cap_loss) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Education-num'),
yaxis = list(title = 'Age'),
zaxis = list(title = 'Capital loss')))
plot5 <- ggplot(df1, aes(x=edu_num,y=cap_loss))+stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE)+labs(title="2D raster type density plot")+theme(legend.position='none')+facet_wrap(~cut_number(df1$age,6))
plot5
The patterns in every panels of trellis plot looks almost similar except the last panel i.e, age group of 48-90. The capital loss seems to be quite high for both age group of 37-48 and 48-90.
plot6a<-ggplot(df, aes(x=edu_num,y=cap_loss,color=cap_loss))+geom_point()+labs(title="Trellis plot")+
facet_grid(cut_number(df$age,4)~.)
plot6a
Agerange<-lattice::equal.count(df$age, number=4, overlap=0.1) #overlap is 10%
L<-matrix(unlist(levels(Agerange)), ncol=2, byrow = T)
L1<-data.frame(Lower=L[,1],Upper=L[,2], Interval=factor(1:nrow(L)))
ggplot(L1)+geom_linerange(aes(ymin = Lower, ymax = Upper, x=Interval))
index=c()
Class=c()
for(i in 1:nrow(L)){
Cl=paste("[", L1$Lower[i], ",", L1$Upper[i], "]", sep="")
ind=which(df$age>=L1$Lower[i] &df$age<=L1$Upper[i])
index=c(index,ind)
Class=c(Class, rep(Cl, length(ind)))
}
df4<-df[index,]
df4$Class<-as.factor(Class)
ggplot(df4, aes(x=edu_num,y=cap_loss, color=cap_loss))+ geom_point()+labs(title="Trellis plot with Shingles")+
facet_grid(Class~., labeller = "label_both")
NA
library(ggplot2)
library(seriation)
library(scales)
library(tidyr)
library(dplyr)
library(lattice)
library(plotly)
library(scales)
library(tidyr)
library(gridExtra)
prices_earnings <- read.delim("prices-and-earnings.txt")
# keep columns 1,2,5,6,7,9,10,16,17,18,19
p_e <- prices_earnings[, c(1,2,5,6,7,9,10,16,17,18,19)]
# set column 1 to row names.
rownames(p_e) <- p_e[[1]]
# Preliminary scaling step to question
# question 2 scaling
p_e_sc <- scale(p_e[,-1])
#p_e_sc %>% it was not possible to pipe the data since it is a matrix
plot_ly(x =~colnames(p_e_sc), y =~rownames(p_e_sc),
z = ~p_e_sc, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Without ordering)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
# seriation needs to permute rows and columns, thus distance by row and column
p_e_rdist <- dist(p_e_sc, method = "euclidean")
# by column
p_e_cdist <- dist(t(p_e_sc), method = "euclidean")
# make sure that results are reproducible
set.seed(1011)
# get orders of the row and col distances; Hamilton path length
ser_row_HC <- seriate(p_e_rdist, method = "OLO")
ser_col_HC <- seriate(p_e_cdist, method = "OLO")
order1 <- get_order(ser_row_HC)
order2 <-get_order(ser_col_HC )
# reorder
p_reord <- p_e_sc[rev(order1), order2]
# plot
plot_ly(x =~colnames(p_reord), y =~rownames(p_reord),
z = ~p_reord, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Euclid dist)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
# computing distance as one minus correlation
p_e_cor <- as.dist((1 - cor(p_e_sc)))
p_e_cor1 <- as.dist((1 - cor(t(p_e_sc))))
# set seed to ensure results are reproducible
set.seed(10111)
# get orders for columns and rows
ord1 <- get_order(seriate(p_e_cor, method = "OLO"))
ord2 <- get_order(seriate(p_e_cor1, method = "OLO"))
# reorder
p_reord2 <- p_e_sc[rev(ord2), ord1]
#plot
plot_ly(x =~colnames(p_reord2), y =~rownames(p_reord2),
z = ~p_reord2, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Cor dist)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
# use p_e_rdist and p_e_cdist (euclidean distance)
# set seed
set.seed(11)
ser_row_TSP <- seriate(p_e_rdist, method = "TSP")
set.seed(111)
ser_col_TSP <- seriate(p_e_cdist, method = "TSP")
# orders
ord_q4_1 <- get_order(ser_row_TSP)
order_q4_2 <-get_order(ser_col_TSP)
# reorder
p_reord_q4 <- p_e_sc[rev(ord_q4_1), order_q4_2]
# plot
plot_ly(x =~colnames(p_reord_q4), y =~rownames(p_reord_q4),
z = ~p_reord_q4, type = "heatmap",
colors = colorRamp(c("black","red"))
) %>%
layout(title = "Heatmap of prices and earnings (Euclid dist- TSP)",
xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
yaxis = list(title = "Cities", zeroline = FALSE)
)
# function creterion to compare unordered distance and ordered
# distance = p_e_rdist (row distance)
# use ser_row_HC because its a simulation, we need to compare the exact same results as used in heatmaps
# use ser_row_TSP, same reason as above
result <- rbind(HC = criterion(p_e_rdist,ser_row_HC ), TSP = criterion(p_e_rdist,ser_row_TSP))
result
# parallel coordinates plot from unsorted scaled data
p_e_sc2 <- as.data.frame(p_e_sc)
p_e_sc2 <- round(p_e_sc2, 1)
p_e_sc2 %>% plot_ly(type ="parcoords",
dimensions = list(
list(label = "Food.Costs...", values = ~Food.Costs...),
list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
list(label = "Clothing.Index", values = ~Clothing.Index),
list(label = "Hours.Worked", values = ~Hours.Worked),
list(label = "Wage.Net", values = ~Wage.Net),
list(label = "Vacation.Days", values = ~Vacation.Days),
list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)
)
# adding a factored column by iphone column which defines the clusters
p_e_sc2$clust <-ifelse(p_e_sc2$iPhone.4S.hr. < -0.5, 0, 1)
p_e_sc2 %>% plot_ly(type ="parcoords",
line = list(color = ~clust, colorscale = list(c(0, "red"), c(1, "blue"))),
dimensions = list(
list(label = "Food.Costs...", values = ~Food.Costs...),
list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
list(label = "Clothing.Index", values = ~Clothing.Index),
list(label = "Hours.Worked", values = ~Hours.Worked),
list(label = "Wage.Net", values = ~Wage.Net),
list(label = "Vacation.Days", values = ~Vacation.Days),
list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
)
)
# get the rownames to a column
p_ord_HC <- as.data.frame(p_reord)
# emply list to contain plots
plots=list()
nPlot= nrow(p_ord_HC)
# put the row names to a column
p_ord_HC %>%
add_rownames(var = "group") %>%
mutate_each(funs(rescale), -group) -> p_HC_radar
for (i in 1:nPlot){
plots[[i]] <- htmltools::tags$div(
plot_ly(type = 'scatterpolar',
r=as.numeric(p_HC_radar[i,-1]),
theta= colnames(p_HC_radar)[-1],
fill="toself")%>%
layout(title=p_HC_radar$group[i]), style="width: 25%;")
}
h <-htmltools::tags$div(style = "display: flex; flex-wrap: wrap", plots)
htmltools::browsable(h)
#grid.arrange(grobs = plots, ncol = 3)
df <- read.csv("adult.csv")
colnames(df) <- c("age","workclass","fnlwgt","edu","edu_num","marital_status","occupation","relationship","race","sex","cap_gain","cap_loss","hours_per_week","native_country","income_level")
#View(df)
plot1 <- ggplot(df, aes(age,hours_per_week,color=income_level))+labs(title="Scatter plot",x="Age", y = "Hours per week")+
geom_point(size=1)
plot1
plot2<-ggplot(df, aes(age,hours_per_week,color=income_level))+geom_point(size=1)+labs(title="Trellis plot",x="Age", y = "Hours per week")+
facet_wrap(~income_level)
plot2
plot3 <- ggplot(df, aes(x=age, group=income_level, fill=income_level))+
geom_density(alpha=0.5) + labs(title="Density plot of age")
plot3
plot4<-ggplot(df, aes(x=age,group=income_level,fill=income_level))+geom_density(alpha=0.5)+labs(title="Trellis plot of age")+facet_wrap(~marital_status)
plot4
df1 <- filter(df, cap_loss!=0)
df1 %>% plot_ly(x = ~edu_num, y = ~age, z = ~cap_loss) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Education-num'),
yaxis = list(title = 'Age'),
zaxis = list(title = 'Capital loss')))
plot5 <- ggplot(df1, aes(x=edu_num,y=cap_loss))+stat_density_2d(aes(fill = ..density..), geom = "raster", contour = FALSE)+labs(title="2D raster type density plot")+theme(legend.position='none')+facet_wrap(~cut_number(df1$age,6))
plot5
plot6a<-ggplot(df, aes(x=edu_num,y=cap_loss,color=cap_loss))+geom_point()+labs(title="Trellis plot")+
facet_grid(cut_number(df$age,4)~.)
plot6a
Agerange<-lattice::equal.count(df$age, number=4, overlap=0.1) #overlap is 10%
L<-matrix(unlist(levels(Agerange)), ncol=2, byrow = T)
L1<-data.frame(Lower=L[,1],Upper=L[,2], Interval=factor(1:nrow(L)))
ggplot(L1)+geom_linerange(aes(ymin = Lower, ymax = Upper, x=Interval))
index=c()
Class=c()
for(i in 1:nrow(L)){
Cl=paste("[", L1$Lower[i], ",", L1$Upper[i], "]", sep="")
ind=which(df$age>=L1$Lower[i] &df$age<=L1$Upper[i])
index=c(index,ind)
Class=c(Class, rep(Cl, length(ind)))
}
df4<-df[index,]
df4$Class<-as.factor(Class)
ggplot(df4, aes(x=edu_num,y=cap_loss, color=cap_loss))+ geom_point()+labs(title="Trellis plot with Shingles")+
facet_grid(Class~., labeller = "label_both")
Hahsler, Kurt; Michael; Hornik, and Christian Buchta. 2008. “Getting Things in Order: An Introduction to the R Package Seriation.” Journal of Statistical Software, Articles 25 (3): 1–34. doi:10.18637/jss.v025.i03.